TFLearn Subject Verb Agreement Error Detection 2

This notebook is based off the original fragment detection notebook, but is specific to the detection of subject-verb agreement errors.

As our training data we will use 799,675 correct sentences and, from a total of 12,743,496 sentences containing subject-verb agreement errors, a randomly chosen 799,675.

The labels will be either 1 or 0, where 1 indicates a sentence with a subject-verb agreement error and 0 indicates a sentence with no subject-verb agreement error.

Because some of the libraries used require Python 2.7, this Jupyter notebook may not run in every environment, but it is used to document the process.

Install Dependencies


In [ ]:
import numpy as np
import tensorflow as tf
import tflearn
from tflearn.data_utils import to_categorical
import spacy
import textacy  # used below for verb-phrase extraction
import re
import csv      # used below to save the key-to-index mapping
from textstat.textstat import textstat
from pattern.en import lexeme, tenses
from pattern.en import pluralize, singularize
import sqlite3
import hashlib

nlp = spacy.load('en_core_web_lg')
conn = sqlite3.connect('db/mangled_agreement.db')
cursor = conn.cursor()
#from nltk.util import ngrams, trigrams
#import csv
#import pandas as pd

Load Datafiles


In [ ]:
# TODO: This is kind of memory intensive don'tcha think?
texts = []
labels = []

# add 0 label to correct sentences
for row in cursor.execute("SELECT sentence FROM orignal_sentences"):
    texts.append(row[0].strip())
    labels.append(0)

# add 1 label to sentences with a subject verb agreement error, limit should match the number of original sentences
for row in cursor.execute("SELECT sentence FROM mangled_sentences ORDER BY RANDOM() LIMIT 799675"):
    texts.append(row[0].strip())
    labels.append(1)
        
print(texts[-10:])
conn.close() # done with sqlite connection
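
If memory becomes a problem, the rows could be streamed from SQLite in batches instead of being materialized in two large lists up front. A rough sketch of that idea, assuming the same connection, table names, and column as above; it would replace the two loops before the connection is closed, with downstream steps consuming the generator rather than the texts/labels lists:

In [ ]:
def stream_labeled_sentences(cursor, batch_size=10000):
    """Yield (sentence, label) pairs in batches to keep peak memory down."""
    queries = [
        ("SELECT sentence FROM orignal_sentences", 0),
        ("SELECT sentence FROM mangled_sentences ORDER BY RANDOM() LIMIT 799675", 1),
    ]
    for query, label in queries:
        cursor.execute(query)
        while True:
            rows = cursor.fetchmany(batch_size)
            if not rows:
                break
            for row in rows:
                yield row[0].strip(), label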
Shuffle the data

In [ ]:
import random

combined = list(zip(texts,labels))
random.shuffle(combined)

texts[:], labels[:] = zip(*combined)
print(texts[-10:])
print(labels[-10:])
Get verb phrase keys for sentence

In [ ]:
def get_verb_phrases(sentence_doc):
    """
    Returns a list like,
    
        [[1], [5, 6, 7]]
        
    where this means 2 verb phrases: a single verb at word index 1, and another verb phrase at word indices 5, 6, 7.
    
     - Adverbs are not included.
     - Infinitive phrases (and verb phrases that are subsets of infinitive phrases) are not included
     
    """ 
    pattern =  r'<VERB>*<ADV>*<VERB>+' #  r'<VERB>?<ADV>*<VERB>+' is suggested by textacy site
    verb_phrases = textacy.extract.pos_regex_matches(sentence_doc, pattern)
    sentence_str = sentence_doc.text
    
    index_2_word_no = {} # map each word's starting character position to its word number, e.g. {0: 0, 3: 1, 7: 2, 12: 3}
    for word in sentence_doc:
        index_2_word_no[word.idx] = word.i

    result = [] # [[1], [5, 6, 7]] => 2 verb phrases: a single verb at index 1, another verb phrase at 5, 6, 7
    for vp in verb_phrases:
        word_numbers = []
        # return the index of 'could have been happily eating' from 'She could have been happily eating chowder'
        str_idx = sentence_str.index(vp.text)
        first_word = index_2_word_no[str_idx] # word number for first word of verb phrase
        
        x = first_word
        if len(vp) > 1:
            for verb_or_adverb in vp:
                # filter out adverbs
                if not verb_or_adverb.pos_ == 'ADV':
                    word_numbers.append(x)
                x += 1
        else:
            word_numbers.append(first_word)
        
        # filter out infinitive phrases
        if ( (word_numbers[0] - 1) < 0) or (sentence_doc[word_numbers[0] - 1].text.lower() != 'to'):
            result.append(word_numbers)
    
    return result


def singular_or_plural(word_string):
    if word_string == singularize(word_string):
        return 'SG'
    else:
        return 'PL'

def sentence_to_keys(sentence):
    doc = textacy.Doc(sentence, lang='en_core_web_lg')
    
    # [[1], [5, 6, 7]] => 2 verb phrases: a single verb at index 1, another verb phrase at 5, 6, 7
    verb_phrases = get_verb_phrases(doc)
    
    # doc = this could be my sentence
    # doc_list = [this, -595002753822348241, 15488046584>THIS, my sentence]
    # final_keys = [-595002753822348241:15488046584>THIS]
    #
    # doc = Jane is only here for tonight
    # doc_list = [Jane, 13440080745121162>SG, only, here, for, tonight ]
    # final_keys = [13440080745121162>SG]
    doc_list = []
    for word in doc:
            
        if word.pos_ == 'VERB':
            tense_hash = hashlib.sha256(str(tenses(word.text)).encode('utf-8')).hexdigest()
            verb_number_or_pronoun = ''
            for child in word.children:
                if child.dep_ == 'nsubj':
                    if child.pos_ == 'PRON':
                        verb_number_or_pronoun = child.text.upper()
                    else:
                        verb_number_or_pronoun = singular_or_plural(child.text)
                    break
        
            doc_list.append(tense_hash + '>' + verb_number_or_pronoun)
        else:
            doc_list.append(word.text)
    
    # Get final keys
    final_keys = []
    for vp in verb_phrases:
        vp_key_list = []
        for word_no in vp:
            vp_key_list.append(doc_list[word_no])
        vp_key = ':'.join(vp_key_list)
        final_keys.append(vp_key)
    
    return final_keys
    
    

sentence_to_keys(texts[3])
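
As a quick sanity check, a grammatical sentence and a mangled version of it should produce different keys, since the verb's tense hash and the subject's number are both folded into each verb-phrase key (the exact hash strings will differ from the illustrative values in the comments above):

In [ ]:
print(sentence_to_keys("The dogs run home."))
print(sentence_to_keys("The dogs runs home."))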

Key counts


In [ ]:
from collections import Counter

c = Counter()

for textString in texts:
    c.update(sentence_to_keys(textString))

total_counts = c

print("Total words in data set: ", len(total_counts))

In [ ]:
vocab = sorted(total_counts, key=total_counts.get, reverse=True)
print(vocab[:60])

In [ ]:
print(vocab[-1], ': ', total_counts[vocab[-1]])

Take the verb phrase keys and index them


In [ ]:
word2idx = {n: i for i, n in enumerate(vocab)}  # create the key-to-index dictionary
print(word2idx)

In [ ]:
def text_to_vector(text):
    wordVector = np.zeros(len(vocab))
    for word in sentence_to_keys(text):
        index = word2idx.get(word, None)
        if index is not None:
            wordVector[index] += 1
    return wordVector

In [ ]:
text_to_vector('Donald, standing on the precipice, began to dance.')[:65]

In [ ]:
word_vectors = np.zeros((len(texts), len(vocab)), dtype=np.int_)
for ii, text in enumerate(texts):
    word_vectors[ii] = text_to_vector(text)

In [ ]:
# Printing out the first 5 word vectors
word_vectors[:5, :23]

Chunking the data for TF


In [ ]:
records = len(labels)
train_fraction = 0.9

# use the first 90% of the records for training and the remaining 10% for testing
train_split = int(records * train_fraction)
print(train_split, records - train_split)
trainX, trainY = word_vectors[:train_split], to_categorical(labels[:train_split], 2)
testX, testY = word_vectors[train_split:], to_categorical(labels[train_split:], 2)

In [ ]:
trainX[-1], trainY[-1]

In [ ]:
len(trainY), len(testY), len(trainY) + len(testY)

Setting up TF


In [ ]:
# Network building
def build_model():
    # This resets all parameters and variables, leave this here
    tf.reset_default_graph()
    
    #### Your code ####
    net = tflearn.input_data([None, len(vocab)])                          # Input
    net = tflearn.fully_connected(net, 200, activation='ReLU')      # Hidden
    net = tflearn.fully_connected(net, 25, activation='ReLU')      # Hidden
    net = tflearn.fully_connected(net, 2, activation='softmax')   # Output
    net = tflearn.regression(net, optimizer='sgd', learning_rate=0.1, loss='categorical_crossentropy')
    model = tflearn.DNN(net)

    return model

In [ ]:
len(vocab)

Initialize


In [ ]:
model = build_model()

Training


In [ ]:
# Training
model.fit(trainX, trainY, validation_set=0.1, show_metric=True, batch_size=128, n_epoch=50)

In [ ]:
# Testing
predictions = (np.array(model.predict(testX))[:,0] >= 0.5).astype(np.int_)
test_accuracy = np.mean(predictions == testY[:,0], axis=0)
print("Test accuracy: ", test_accuracy)

In [ ]:
with open("../models/subjectverbagreementindex.csv", "w") as f:
    w = csv.writer(f)
    for key, val in word2idx.items():
        w.writerow([key, val])

In [ ]:
model.save("../models/subject_verb_agreement_model.tfl")

Playground


In [ ]:
def test_sentence(sentence):
    positive_prob = model.predict([text_to_vector(sentence)])[0][1]
    print('Does this sentence contain a subject-verb agreement error?\n {}'.format(sentence))
    print('P(positive) = {:.3f} :'.format(positive_prob), 
          'Yes' if positive_prob > 0.5 else 'No')

In [ ]:
test_sentence("Neglecting to recognize the horrors those people endure allow people to go to war more easily.")

In [ ]:
test_sentence("Katherine, gesticulating wildly and dripping in sweat, kissed him on the cheek.")

In [ ]:
test_sentence("Working far into the night in an effort to salvage her little boat.")

In [ ]:
test_sentence("Working far into the night in an effort to salvage her little boat, she slowly grew tired.")

In [ ]:
test_sentence("Rushing to the rescue with his party.")

In [ ]:
test_sentence("Isobel was about thirteen now, and as pretty a girl, according to Buzzby, as you could meet with in any part of Britain.")

In [ ]:
test_sentence("Being of a modest and retiring disposition, Mr. Hawthorne avoided publicity.")

In [ ]:
test_sentence("Clambering to the top of a bridge, he observed a great rainbow")

In [ ]:
test_sentence("Clambering to the top of a bridge.")

In [ ]:
test_sentence("He observed a great rainbow.")

In [ ]:
test_sentence("Sitting on the iron throne, Joffry looked rather fat.")

In [ ]:
test_sentence("Worrying that a meteor or chunk of space debris will conk her on the head.")

In [ ]:
test_sentence("Aunt Olivia always wears a motorcycle helmet, worrying that a meteor or chunk of space debris will conk her on the head")

In [ ]:
test_sentence("Affecting the lives of many students in New York City.")

In [ ]:
test_sentence("Quill was a miracle, affecting the lives of many students in New York City.")

In [ ]:
test_sentence("Standing on the edge of the cliff looking down.")

In [ ]:
test_sentence("Emilia, standing on the edge of the cliff and looking down, began to weep.")

In [ ]:
test_sentence("Standing on the edge of the cliff and looking down, Emilia began to weep.")

In [ ]:
test_sentence("Tired and needing sleep.")

Save the vocab


In [ ]:
vocab
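
To actually write the vocab out so it can be reloaded at inference time (the path below is illustrative; adjust it to match the project layout):

In [ ]:
with open("../models/subject_verb_agreement_vocab.txt", "w") as f:
    for key in vocab:
        f.write(key + "\n")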
